Dataset Preprocessing¶

Library loading¶

In [1]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import yaml
import os
import warnings
import socket
import matplotlib as plt

warnings.filterwarnings('ignore')
In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
with open("../data/resources/rcParams.yaml") as f:
    rcParamsDict = yaml.full_load(f)
    for k in rcParamsDict["rcParams"]:
        print("{} {}".format(k,rcParamsDict["rcParams"][k]))
        plt.rcParams[k] = rcParamsDict["rcParams"][k]
    for k1 in set(list(rcParamsDict)).difference(set(["rcParams"])):
        print("{} {}".format(k1,rcParamsDict[k1]))
scanpy==1.8.0 anndata==0.8.0 umap==0.4.6 numpy==1.22.2 scipy==1.6.2 pandas==1.2.3 scikit-learn==0.24.1 statsmodels==0.13.5 python-igraph==0.9.1 louvain==0.7.0 leidenalg==0.8.3
figure.dpi 50
savefig.dpi 500
figure.figsize [10, 10]
axes.facecolor white
dotSize 20
In [3]:
DSname="DownD50"
DSDirName="Sample_S20273_158"

Configure paths¶

In [4]:
outdir = "../data/output"
if not os.path.exists(outdir):
   # Create a new directory because it does not exist
   os.makedirs(outdir)
    
if not os.path.exists("../data/output/adatas"):
   # Create a new directory because it does not exist
   os.makedirs("../data/output/adatas")

with open("../data/resources/iPSC_lines_map.yaml", 'r') as f:
    iPSC_lines_map = yaml.load(f, Loader=yaml.FullLoader)["lines"]
results_file = outdir+'/'+DSname+'.h5ad'
In [5]:
scanpyObj = sc.read_10x_mtx('../data/'+DSDirName+'/filtered_feature_bc_matrix/',  # the directory with the `.mtx` file
    var_names='gene_symbols',                # use gene symbols for the variable names (variables-axis index)
    cache=True)                              # write a cache file for faster subsequent reading

scanpyObj.obs['dataset'] = DSname
scanpyObj.obs_names = [i + "_" + j for i, j in zip(scanpyObj.obs_names.tolist(), scanpyObj.obs["dataset"].tolist())]
... reading from cache file cache/..-data-Sample_S20273_158-filtered_feature_bc_matrix-matrix.h5ad
In [6]:
scanpyObj.var_names_make_unique()
scanpyObj.obs_names_make_unique()

Importing cellIDs¶

In [7]:
cellID = pd.read_csv('../data/'+DSDirName+'/aggregatedCall/aggregatedCall.tsv', sep ="\t",index_col = 0)
cellID
cellID.index = [i + "_" + DSname for i in cellID.index.tolist()]
In [8]:
cellID.Consensus.unique()
Out[8]:
array(['3391B', 'SA2', 'DESTa', 'MIFF1', 'KOLF', 'A15441', 'doublet',
       'LowQuality'], dtype=object)
In [9]:
scanpyObj.obs['cellID'] = cellID.loc[scanpyObj.obs_names, 'Consensus']
In [10]:
scanpyObj.obs
Out[10]:
dataset cellID
AAACCTGAGACCCACC-1_DownD50 DownD50 LowQuality
AAACCTGAGACTAGAT-1_DownD50 DownD50 3391B
AAACCTGAGAGCCTAG-1_DownD50 DownD50 LowQuality
AAACCTGAGAGTTGGC-1_DownD50 DownD50 3391B
AAACCTGAGCTATGCT-1_DownD50 DownD50 SA2
... ... ...
TTTGTCATCCGCGGTA-1_DownD50 DownD50 KOLF
TTTGTCATCCGCGTTT-1_DownD50 DownD50 doublet
TTTGTCATCTCACATT-1_DownD50 DownD50 DESTa
TTTGTCATCTGATACG-1_DownD50 DownD50 MIFF1
TTTGTCATCTTTCCTC-1_DownD50 DownD50 A15441

17459 rows × 2 columns

Configure colors¶

In [11]:
cellID_colors = {}
cellID_newName_colors = {}
cellID_newNames = {}


for line in iPSC_lines_map.keys():
    cellID_colors[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["color"]
    cellID_newName_colors[iPSC_lines_map[line]["newName"]] = iPSC_lines_map[line]["color"]
    cellID_newNames[iPSC_lines_map[line]["oldName"]] = iPSC_lines_map[line]["newName"]

scanpyObj.obs["cellID"] = scanpyObj.obs["cellID"].astype("category")
scanpyObj.obs["cellID_newName"] = scanpyObj.obs["cellID"].replace(cellID_newNames, inplace=False).astype("category")
scanpyObj.uns["cellID_colors"] = [cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
scanpyObj.uns["cellID_newName_colors"] = [cellID_newName_colors[line] for line in scanpyObj.obs["cellID_newName"].cat.categories]
In [12]:
[cellID_colors[line] for line in scanpyObj.obs["cellID"].cat.categories]
Out[12]:
['#DBB807',
 '#64f5d9',
 '#FF00CC',
 '#0FB248',
 '#a8a5a5',
 '#7B00FF',
 '#99FF00',
 '#403b3b']

Preprocessing¶

Show those genes that yield the highest fraction of counts in each single cells, across all cells.

In [13]:
sc.pl.highest_expr_genes(scanpyObj, n_top=20 )
normalizing counts per cell
    finished (0:00:00)

Subsetting according to good barcodes¶

In [14]:
goodBarcodes=pd.read_csv(outdir+'/'+DSname+'_filteredCells.tsv', sep="\t")["BCs"]+"_"+DSname
scanpyObj = scanpyObj[goodBarcodes,:]

QC metrices¶

In [15]:
scanpyObj.var['mt'] = scanpyObj.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['mt'], percent_top=None, log1p=True, inplace=True)

scanpyObj.var['ribo'] = scanpyObj.var_names.str.startswith('RP')  # annotate the group of ribosomal genes as 'ribo'
sc.pp.calculate_qc_metrics(scanpyObj, qc_vars=['ribo'], percent_top=None, log1p=True, inplace=True)
In [16]:
scanpyObj
Out[16]:
AnnData object with n_obs × n_vars = 5027 × 33538
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo'
    uns: 'cellID_colors', 'cellID_newName_colors'
In [17]:
scanpyObj.var
Out[17]:
gene_ids feature_types mt n_cells_by_counts mean_counts log1p_mean_counts pct_dropout_by_counts total_counts log1p_total_counts ribo
MIR1302-2HG ENSG00000243485 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM138A ENSG00000237613 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
OR4F5 ENSG00000186092 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AL627309.1 ENSG00000238009 Gene Expression False 5 0.000995 0.000994 99.900537 5.0 1.791759 False
AL627309.3 ENSG00000239945 Gene Expression False 1 0.000199 0.000199 99.980107 1.0 0.693147 False
... ... ... ... ... ... ... ... ... ... ...
AC233755.2 ENSG00000277856 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC233755.1 ENSG00000275063 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
AC240274.1 ENSG00000271254 Gene Expression False 167 0.034613 0.034028 96.677939 174.0 5.164786 False
AC213203.1 ENSG00000277475 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False
FAM231C ENSG00000268674 Gene Expression False 0 0.000000 0.000000 100.000000 0.0 0.000000 False

33538 rows × 10 columns

In [18]:
scanpyObj.obs
Out[18]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCTGAGACTAGAT-1_DownD50 DownD50 3391B CTL01 2179 7.687080 6489.0 8.778018 83.0 4.430817 1.279088 1791.0 7.491087 27.600554
AAACCTGAGAGTTGGC-1_DownD50 DownD50 3391B CTL01 873 6.773080 2612.0 7.868254 4.0 1.609438 0.153139 1268.0 7.145985 48.545177
AAACCTGCACATGGGA-1_DownD50 DownD50 3391B CTL01 2382 7.776115 10693.0 9.277438 246.0 5.509388 2.300570 4988.0 8.514991 46.647339
AAACCTGCACGAGAGT-1_DownD50 DownD50 MIFF1 CTL02A 912 6.816736 1959.0 7.580700 26.0 3.295837 1.327208 741.0 6.609349 37.825420
AAACCTGCACGAGGTA-1_DownD50 DownD50 3391B CTL01 1816 7.504942 5844.0 8.673342 112.0 4.727388 1.916496 2148.0 7.672758 36.755650
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTCAGTACCGCTG-1_DownD50 DownD50 3391B CTL01 3294 8.100161 11077.0 9.312716 296.0 5.693732 2.672204 3160.0 8.058643 28.527578
TTTGTCAGTCAAACTC-1_DownD50 DownD50 MIFF1 CTL02A 1172 7.067320 3124.0 8.047190 114.0 4.744932 3.649168 1231.0 7.116394 39.404610
TTTGTCAGTCACCTAA-1_DownD50 DownD50 3391B CTL01 1261 7.140453 3082.0 8.033658 45.0 3.828641 1.460091 671.0 6.510258 21.771578
TTTGTCAGTCTAGTCA-1_DownD50 DownD50 3391B CTL01 2987 8.002360 12697.0 9.449200 234.0 5.459586 1.842955 5049.0 8.527143 39.765297
TTTGTCAGTGCATCTA-1_DownD50 DownD50 3391B CTL01 3847 8.255309 19168.0 9.861050 347.0 5.852202 1.810309 7850.0 8.968396 40.953671

5027 rows × 13 columns

In [19]:
sc.pl.violin(scanpyObj, ['n_genes_by_counts', 'total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['total_counts_mt','total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [20]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [21]:
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='log1p_total_counts', y='log1p_total_counts_ribo')
In [22]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True, rotation=90)
In [23]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [24]:
sc.pl.violin(scanpyObj, ['log1p_n_genes_by_counts', 'log1p_total_counts'],
             groupby= "cellID", jitter=0.4, multi_panel=True)

sc.pl.violin(scanpyObj, ['log1p_total_counts_mt','log1p_total_counts_ribo'],
             groupby= "cellID", jitter=0.4, multi_panel=True)
In [25]:
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_n_genes_by_counts')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_mt')
sc.pl.scatter(scanpyObj, x='total_counts', y='log1p_total_counts_ribo')
In [26]:
scanpyObj.write_h5ad(outdir+'/adatas/'+DSname+'_raw.h5ad')
In [27]:
sc.pp.normalize_total(scanpyObj, exclude_highly_expressed=True, max_fraction=.1)
normalizing counts per cell The following highly-expressed genes are not considered during normalization factor computation:
['MALAT1']
    finished (0:00:00)
In [28]:
sc.pp.log1p(scanpyObj)
In [29]:
scanpyObj.raw = scanpyObj

Subset according to JointHVGs and Filtered Barcodes from previous step¶

In [30]:
HVGs=pd.read_csv(outdir+"/HVG_list_intersection_Curated.txt", sep = "\t")["HVG"]

scanpyObj = scanpyObj[:,HVGs]
scanpyObj.var["highly_variable"] = True
In [31]:
#sc.pp.highly_variable_genes(scanpyObj, min_mean=0.0125, max_mean=5, min_disp=0.5)
In [32]:
#scanpyObj = scanpyObj[:, HVG.tolist()]

#Multiplexing = Multiplexing[:, Multiplexing.var.highly_variable]
In [33]:
#sc.pl.highly_variable_genes(scanpyObj)

Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.

In [34]:
sc.pp.regress_out(scanpyObj, ['total_counts',"pct_counts_mt"])
regressing out ['total_counts', 'pct_counts_mt']
    sparse input is densified and may lead to high memory use
    finished (0:00:16)

Scale each gene to unit variance. Clip values exceeding standard deviation 10.

In [35]:
sc.pp.scale(scanpyObj, max_value=20)
In [36]:
scanpyObj.obs
Out[36]:
dataset cellID cellID_newName n_genes_by_counts log1p_n_genes_by_counts total_counts log1p_total_counts total_counts_mt log1p_total_counts_mt pct_counts_mt total_counts_ribo log1p_total_counts_ribo pct_counts_ribo
AAACCTGAGACTAGAT-1_DownD50 DownD50 3391B CTL01 2179 7.687080 6489.0 8.778018 83.0 4.430817 1.279088 1791.0 7.491087 27.600554
AAACCTGAGAGTTGGC-1_DownD50 DownD50 3391B CTL01 873 6.773080 2612.0 7.868254 4.0 1.609438 0.153139 1268.0 7.145985 48.545177
AAACCTGCACATGGGA-1_DownD50 DownD50 3391B CTL01 2382 7.776115 10693.0 9.277438 246.0 5.509388 2.300570 4988.0 8.514991 46.647339
AAACCTGCACGAGAGT-1_DownD50 DownD50 MIFF1 CTL02A 912 6.816736 1959.0 7.580700 26.0 3.295837 1.327208 741.0 6.609349 37.825420
AAACCTGCACGAGGTA-1_DownD50 DownD50 3391B CTL01 1816 7.504942 5844.0 8.673342 112.0 4.727388 1.916496 2148.0 7.672758 36.755650
... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTCAGTACCGCTG-1_DownD50 DownD50 3391B CTL01 3294 8.100161 11077.0 9.312716 296.0 5.693732 2.672204 3160.0 8.058643 28.527578
TTTGTCAGTCAAACTC-1_DownD50 DownD50 MIFF1 CTL02A 1172 7.067320 3124.0 8.047190 114.0 4.744932 3.649168 1231.0 7.116394 39.404610
TTTGTCAGTCACCTAA-1_DownD50 DownD50 3391B CTL01 1261 7.140453 3082.0 8.033658 45.0 3.828641 1.460091 671.0 6.510258 21.771578
TTTGTCAGTCTAGTCA-1_DownD50 DownD50 3391B CTL01 2987 8.002360 12697.0 9.449200 234.0 5.459586 1.842955 5049.0 8.527143 39.765297
TTTGTCAGTGCATCTA-1_DownD50 DownD50 3391B CTL01 3847 8.255309 19168.0 9.861050 347.0 5.852202 1.810309 7850.0 8.968396 40.953671

5027 rows × 13 columns

Principal component analysis¶

In [37]:
sc.tl.pca(scanpyObj, svd_solver='arpack')
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:01)
In [38]:
sc.pl.pca(scanpyObj, color='MKI67')
In [39]:
sc.pl.pca_variance_ratio(scanpyObj, log=True)
In [40]:
scanpyObj
Out[40]:
AnnData object with n_obs × n_vars = 5027 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

Computing the neighborhood graph¶

In [41]:
sc.pp.neighbors(scanpyObj, n_neighbors=10, n_pcs=9)
computing neighbors
    using 'X_pca' with n_pcs = 9
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:02)

Embedding the neighborhood graph¶

In [42]:
sc.tl.umap(scanpyObj)
scanpyObj
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:04)
Out[42]:
AnnData object with n_obs × n_vars = 5027 × 3499
    obs: 'dataset', 'cellID', 'cellID_newName', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo'
    var: 'gene_ids', 'feature_types', 'mt', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'ribo', 'highly_variable', 'mean', 'std'
    uns: 'cellID_colors', 'cellID_newName_colors', 'log1p', 'pca', 'neighbors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [43]:
sc.pl.umap(scanpyObj, color=['MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [44]:
sc.tl.diffmap(scanpyObj)
computing Diffusion Maps using n_comps=15(=n_dcs)
computing transitions
    finished (0:00:00)
    eigenvalues of transition matrix
    [1.         0.99485993 0.9908001  0.9835778  0.980123   0.9765549
     0.9758682  0.974007   0.9700164  0.9674469  0.96155995 0.96025586
     0.9564847  0.951052   0.9480252 ]
    finished: added
    'X_diffmap', diffmap coordinates (adata.obsm)
    'diffmap_evals', eigenvalues of transition matrix (adata.uns) (0:00:00)
In [45]:
sc.tl.dpt(scanpyObj)
WARNING: No root cell found. To compute pseudotime, pass the index or expression vector of a root cell, one of:
    adata.uns['iroot'] = root_cell_index
    adata.var['xroot'] = adata[root_cell_name, :].X
computing Diffusion Pseudotime using n_dcs=10
    finished: added
 (0:00:00)
In [46]:
sc.pl.diffmap(scanpyObj, color=[ 'MKI67', 'S100B',"NES","PAX6","DCX",'STMN2',"DCN", "BGN"])
In [47]:
scanpyObj.X.max()
Out[47]:
20.0